{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Writing A Stacking Agreggator with Scikit-Learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "\n",
    "cali_housing = fetch_california_housing()\n",
    "\n",
    "X = cali_housing.data\n",
    "y = cali_housing.target\n",
    "\n",
    "bins = np.arange(6)\n",
    " \n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "binned_y = np.digitize(y, bins)\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor\n",
    " \n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "X_train_prin, X_test_prin, y_train_prin, y_test_prin = train_test_split(X, y,test_size=0.2,stratify=binned_y)\n",
    "\n",
    "binned_y_train_prin = np.digitize(y_train_prin, bins)\n",
    "X_1, X_stack, y_1, y_stack = train_test_split(X_train_prin, y_train_prin,test_size=0.33,stratify=binned_y_train_prin )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=1.0,\n",
       "         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,\n",
       "         random_state=None, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=5, n_jobs=-1,\n",
       "          param_distributions={'max_features': [0.5, 1.0], 'max_samples': [0.5, 1.0], 'oob_score': [True, False], 'n_estimators': [100], 'base_estimator__n_neighbors': [3, 5]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import BaggingRegressor\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {\n",
    " 'max_samples': [0.5,1.0],\n",
    " 'max_features' : [0.5,1.0],\n",
    " 'oob_score' : [True, False],\n",
    " 'base_estimator__n_neighbors': [3,5],\n",
    " 'n_estimators': [100]\n",
    " }\n",
    "\n",
    "single_estimator = KNeighborsRegressor()\n",
    "ensemble_estimator = BaggingRegressor(base_estimator = single_estimator)\n",
    "\n",
    "pre_gs_inst_bag = RandomizedSearchCV(ensemble_estimator,\n",
    " param_distributions = param_dist,\n",
    " cv=3,\n",
    " n_iter = 5,\n",
    " n_jobs=-1)\n",
    "\n",
    "pre_gs_inst_bag.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=0.5,\n",
       "         max_samples=0.5, n_estimators=3000, n_jobs=1, oob_score=False,\n",
       "         random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rs_bag = BaggingRegressor(**{'max_features': 0.5,\n",
    " 'max_samples': 0.5,\n",
    " 'n_estimators': 3000,\n",
    " 'oob_score': False, \n",
    " 'base_estimator': KNeighborsRegressor(n_neighbors=3)})\n",
    "\n",
    "rs_bag.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n",
       "             max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "             min_impurity_split=None, min_samples_leaf=1,\n",
       "             min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "             n_estimators=100, presort='auto', random_state=None,\n",
       "             subsample=1.0, verbose=0, warm_start=True),\n",
       "          fit_params=None, iid=True, n_iter=30, n_jobs=-1,\n",
       "          param_distributions={'loss': ['ls', 'huber'], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.275, 0.3, 0.325], 'min_samples_leaf': [1, 2, 3, 4, 5, 10], 'n_estimators': [50, 100], 'max_features': ['log2', 0.4, 0.5, 0.6, 1.0], 'max_depth': [2, 3, 4, 5, 6, 7, 10]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {'max_features' : ['log2',0.4,0.5,0.6,1.0],\n",
    " 'max_depth' : [2,3, 4, 5,6, 7, 10],\n",
    " 'min_samples_leaf' : [1,2, 3, 4, 5, 10],\n",
    " 'n_estimators': [50, 100],\n",
    " 'learning_rate' : [0.01,0.05,0.1,0.25,0.275,0.3,0.325],\n",
    " 'loss' : ['ls','huber']\n",
    " }\n",
    "pre_gs_inst = RandomizedSearchCV(GradientBoostingRegressor(warm_start=True),param_distributions = param_dist,\n",
    " cv=3,\n",
    " n_iter = 30, n_jobs=-1)\n",
    "pre_gs_inst.fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gbt_inst = GradientBoostingRegressor(**{'learning_rate': 0.05,\n",
    " 'loss': 'huber',\n",
    " 'max_depth': 10,\n",
    " 'max_features': 0.4,\n",
    " 'min_samples_leaf': 5,\n",
    " 'n_estimators': 3000,\n",
    " 'warm_start': True}).fit(X_1, y_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y_pred_bag = rs_bag.predict(X_stack)\n",
    "y_pred_gbt = gbt_inst.predict(X_stack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.544276345578\n",
      "MAE :  0.599640103887\n",
      "MAPE :  0.395826764392\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "print \"R-squared\",r2_score(y_stack, y_pred_bag)\n",
    "print \"MAE : \",mean_absolute_error(y_stack, y_pred_bag)\n",
    "print \"MAPE : \",(np.abs(y_stack- y_pred_bag)/y_stack).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.839682303333\n",
      "MAE :  0.29734891807\n",
      "MAPE :  0.167723289248\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "print \"R-squared\",r2_score(y_stack, y_pred_gbt)\n",
    "print \"MAE : \",mean_absolute_error(y_stack, y_pred_gbt)\n",
    "print \"MAPE : \",(np.abs(y_stack- y_pred_gbt)/y_stack).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bag</th>\n",
       "      <th>gbt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.357244</td>\n",
       "      <td>3.031665</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.400237</td>\n",
       "      <td>3.080777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.070333</td>\n",
       "      <td>0.582493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.719097</td>\n",
       "      <td>1.068786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.382738</td>\n",
       "      <td>2.246161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2.307577</td>\n",
       "      <td>2.227196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1.977660</td>\n",
       "      <td>2.261220</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1.622037</td>\n",
       "      <td>1.254949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1.808480</td>\n",
       "      <td>1.593632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.671473</td>\n",
       "      <td>1.610620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1.719750</td>\n",
       "      <td>0.976184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1.739504</td>\n",
       "      <td>1.247797</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1.758707</td>\n",
       "      <td>1.546528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2.419770</td>\n",
       "      <td>3.809715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1.921614</td>\n",
       "      <td>1.708258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1.935681</td>\n",
       "      <td>1.965999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2.801091</td>\n",
       "      <td>4.823605</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2.213974</td>\n",
       "      <td>1.811436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1.812747</td>\n",
       "      <td>1.532421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1.970867</td>\n",
       "      <td>1.765430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2.080933</td>\n",
       "      <td>2.503043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>1.441320</td>\n",
       "      <td>0.989363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1.693857</td>\n",
       "      <td>1.205200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1.759397</td>\n",
       "      <td>0.787499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1.555164</td>\n",
       "      <td>1.062193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>1.727504</td>\n",
       "      <td>0.695459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>2.244043</td>\n",
       "      <td>2.268217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1.850907</td>\n",
       "      <td>1.598836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>2.387277</td>\n",
       "      <td>3.789166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>2.749881</td>\n",
       "      <td>2.358799</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5419</th>\n",
       "      <td>1.439097</td>\n",
       "      <td>1.168499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5420</th>\n",
       "      <td>1.557687</td>\n",
       "      <td>0.871280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5421</th>\n",
       "      <td>1.530500</td>\n",
       "      <td>0.811253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5422</th>\n",
       "      <td>1.317707</td>\n",
       "      <td>0.895562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5423</th>\n",
       "      <td>2.129840</td>\n",
       "      <td>1.689821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5424</th>\n",
       "      <td>2.495690</td>\n",
       "      <td>2.781522</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5425</th>\n",
       "      <td>2.195624</td>\n",
       "      <td>1.662339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5426</th>\n",
       "      <td>1.746303</td>\n",
       "      <td>1.254423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5427</th>\n",
       "      <td>2.389191</td>\n",
       "      <td>3.033702</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5428</th>\n",
       "      <td>1.722743</td>\n",
       "      <td>1.079788</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5429</th>\n",
       "      <td>1.900083</td>\n",
       "      <td>1.877705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5430</th>\n",
       "      <td>2.417894</td>\n",
       "      <td>2.354172</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5431</th>\n",
       "      <td>1.516697</td>\n",
       "      <td>1.038459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5432</th>\n",
       "      <td>1.379997</td>\n",
       "      <td>1.020229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5433</th>\n",
       "      <td>2.094180</td>\n",
       "      <td>3.626962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5434</th>\n",
       "      <td>2.083683</td>\n",
       "      <td>1.901797</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5435</th>\n",
       "      <td>1.664363</td>\n",
       "      <td>1.956676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5436</th>\n",
       "      <td>1.484647</td>\n",
       "      <td>0.946978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5437</th>\n",
       "      <td>1.662230</td>\n",
       "      <td>1.844416</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5438</th>\n",
       "      <td>2.518264</td>\n",
       "      <td>2.454139</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5439</th>\n",
       "      <td>1.780433</td>\n",
       "      <td>1.423553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5440</th>\n",
       "      <td>1.681377</td>\n",
       "      <td>1.558085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5441</th>\n",
       "      <td>2.451137</td>\n",
       "      <td>2.612259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5442</th>\n",
       "      <td>2.079831</td>\n",
       "      <td>1.568341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5443</th>\n",
       "      <td>1.808457</td>\n",
       "      <td>1.479981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5444</th>\n",
       "      <td>2.132794</td>\n",
       "      <td>1.994456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5445</th>\n",
       "      <td>1.229540</td>\n",
       "      <td>0.768184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5446</th>\n",
       "      <td>3.766285</td>\n",
       "      <td>4.952524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5447</th>\n",
       "      <td>1.862344</td>\n",
       "      <td>1.331403</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5448</th>\n",
       "      <td>2.396340</td>\n",
       "      <td>2.174182</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5449 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           bag       gbt\n",
       "0     2.357244  3.031665\n",
       "1     2.400237  3.080777\n",
       "2     1.070333  0.582493\n",
       "3     1.719097  1.068786\n",
       "4     2.382738  2.246161\n",
       "5     2.307577  2.227196\n",
       "6     1.977660  2.261220\n",
       "7     1.622037  1.254949\n",
       "8     1.808480  1.593632\n",
       "9     1.671473  1.610620\n",
       "10    1.719750  0.976184\n",
       "11    1.739504  1.247797\n",
       "12    1.758707  1.546528\n",
       "13    2.419770  3.809715\n",
       "14    1.921614  1.708258\n",
       "15    1.935681  1.965999\n",
       "16    2.801091  4.823605\n",
       "17    2.213974  1.811436\n",
       "18    1.812747  1.532421\n",
       "19    1.970867  1.765430\n",
       "20    2.080933  2.503043\n",
       "21    1.441320  0.989363\n",
       "22    1.693857  1.205200\n",
       "23    1.759397  0.787499\n",
       "24    1.555164  1.062193\n",
       "25    1.727504  0.695459\n",
       "26    2.244043  2.268217\n",
       "27    1.850907  1.598836\n",
       "28    2.387277  3.789166\n",
       "29    2.749881  2.358799\n",
       "...        ...       ...\n",
       "5419  1.439097  1.168499\n",
       "5420  1.557687  0.871280\n",
       "5421  1.530500  0.811253\n",
       "5422  1.317707  0.895562\n",
       "5423  2.129840  1.689821\n",
       "5424  2.495690  2.781522\n",
       "5425  2.195624  1.662339\n",
       "5426  1.746303  1.254423\n",
       "5427  2.389191  3.033702\n",
       "5428  1.722743  1.079788\n",
       "5429  1.900083  1.877705\n",
       "5430  2.417894  2.354172\n",
       "5431  1.516697  1.038459\n",
       "5432  1.379997  1.020229\n",
       "5433  2.094180  3.626962\n",
       "5434  2.083683  1.901797\n",
       "5435  1.664363  1.956676\n",
       "5436  1.484647  0.946978\n",
       "5437  1.662230  1.844416\n",
       "5438  2.518264  2.454139\n",
       "5439  1.780433  1.423553\n",
       "5440  1.681377  1.558085\n",
       "5441  2.451137  2.612259\n",
       "5442  2.079831  1.568341\n",
       "5443  1.808457  1.479981\n",
       "5444  2.132794  1.994456\n",
       "5445  1.229540  0.768184\n",
       "5446  3.766285  4.952524\n",
       "5447  1.862344  1.331403\n",
       "5448  2.396340  2.174182\n",
       "\n",
       "[5449 rows x 2 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred_bag = pre_gs_inst_bag.predict(X_stack)\n",
    "y_pred_gbt = gbt_inst.predict(X_stack)\n",
    "\n",
    "preds_df = pd.DataFrame(columns = ['bag', 'gbt'])\n",
    "\n",
    "preds_df['bag'] = y_pred_bag\n",
    "preds_df['gbt'] = y_pred_gbt\n",
    "\n",
    "\n",
    "preds_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bag</th>\n",
       "      <th>gbt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>bag</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.906329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gbt</th>\n",
       "      <td>0.906329</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          bag       gbt\n",
       "bag  1.000000  0.906329\n",
       "gbt  0.906329  1.000000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds_df.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "          max_features='auto', max_leaf_nodes=None,\n",
       "          min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "          min_samples_leaf=1, min_samples_split=2,\n",
       "          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
       "          oob_score=False, random_state=None, verbose=0, warm_start=True),\n",
       "          fit_params=None, iid=True, n_iter=15, n_jobs=1,\n",
       "          param_distributions={'max_features': ['sqrt', 'log2', 1.0], 'n_estimators': [50, 100], 'oob_score': [True, False], 'min_samples_leaf': [1, 2, 3, 7, 11]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {'max_features' : ['sqrt','log2',1.0],\n",
    " 'min_samples_leaf' : [1, 2, 3, 7, 11],\n",
    " 'n_estimators': [50, 100],\n",
    " 'oob_score': [True, False]}\n",
    "\n",
    "pre_gs_inst = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True),\n",
    " param_distributions = param_dist,\n",
    " cv=3,\n",
    " n_iter = 15)\n",
    "\n",
    "pre_gs_inst.fit(preds_df.values, y_stack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'bootstrap': True,\n",
       " 'max_features': 'sqrt',\n",
       " 'min_samples_leaf': 2,\n",
       " 'n_estimators': 2000,\n",
       " 'n_jobs': -1,\n",
       " 'oob_score': False,\n",
       " 'warm_start': True}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import copy\n",
    " \n",
    "param_dict = copy.deepcopy(pre_gs_inst.best_params_)\n",
    " \n",
    "param_dict['n_estimators'] = 2000\n",
    "param_dict['warm_start'] = True\n",
    "param_dict['bootstrap'] = True\n",
    "param_dict['n_jobs'] = -1\n",
    " \n",
    "param_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "          max_features='sqrt', max_leaf_nodes=None,\n",
       "          min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "          min_samples_leaf=2, min_samples_split=2,\n",
       "          min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,\n",
       "          oob_score=False, random_state=None, verbose=0, warm_start=True)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_etr = ExtraTreesRegressor(**param_dict)\n",
    "final_etr.fit(preds_df.values, y_stack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def handle_X_set(X_train_set):\n",
    "    y_pred_bag = rs_bag.predict(X_train_set)\n",
    "    y_pred_gbt = gbt_inst.predict(X_train_set)\n",
    "    preds_df = pd.DataFrame(columns = ['bag', 'gbt'])\n",
    "\n",
    "    preds_df['bag'] = y_pred_bag\n",
    "    preds_df['gbt'] = y_pred_gbt\n",
    " \n",
    "    return preds_df.values\n",
    "\n",
    "def predict_from_X_set(X_train_set):\n",
    "    return final_etr.predict(handle_X_set(X_train_set)) \n",
    "\n",
    "y_pred = predict_from_X_set(X_test_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.813377711684\n",
      "MAE :  0.329035411313\n",
      "MAPE :  0.18643146133\n"
     ]
    }
   ],
   "source": [
    "# It is possible to increase score by including the data columns alongside the stacker predictions\n",
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "print \"R-squared\",r2_score(y_test_prin, y_pred)\n",
    "print \"MAE : \",mean_absolute_error(y_test_prin, y_pred)\n",
    "print \"MAPE : \",(np.abs(y_test_prin- y_pred)/y_test_prin).mean()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
